In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display_html, HTML
import urllib.request
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
letters = re.compile('[a-zA-Z]')
def df_to_html(df):
display_html(HTML(df.to_html()))
def load_gutenberg_book(url, char_limit=10000, min_len_of_sections=40):
"""
Returns a list of paragraphs in the book.
url: A url from Project Gutenberg.
    char_limit: Number of characters of the book to read.
min_len_of_sections: Each paragraph must be at least this many characters long.
"""
    book = urllib.request.urlopen(url)
    # urlopen returns bytes in Python 3, so decode before splitting into paragraphs.
    book_text = book.read(char_limit if char_limit else None).decode("utf-8", errors="ignore")
result = []
for text in book_text[:char_limit].split("\r\n\r\n"):
if len(text) >= min_len_of_sections:
clean_text = text.replace("\r\n", " ").strip()
result.append(clean_text)
    # Drop the first few sections, which hold the Project Gutenberg header.
    start_position = len(result) if len(result) < 6 else 6
    return result[start_position:]
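# Hypothetical usage (not run here): fetch a book straight from Project Gutenberg
# and keep only the longer paragraphs, e.g.
#   paragraphs = load_gutenberg_book("https://www.gutenberg.org/files/11/11-0.txt")
# The URL is only an example; the rest of this notebook loads books from local
# files instead (see books_to_pandas below).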
def get_text(path):
"""
Handle all the weird ways books are encoded.
"""
encoding_options = "ascii utf-8 utf-16 utf-32 utf-16-be utf-16-le utf-32-be utf-32-le".split()
for encoding in encoding_options:
try:
with open(path, encoding=encoding) as book:
return book.read()
except UnicodeDecodeError:
continue
    raise ValueError("Could not decode {} with any of the tried encodings".format(path))
def extract_term(term_indicator, text, default=None, max_term_size=75):
term_start = text.find(term_indicator)
# If not found, return default.
if term_start == -1:
term = default
else:
term_end = text.find("\n", term_start)
term = text[term_start+len(term_indicator):term_end].strip()
if term and (len(term) > max_term_size):
term = default
return term
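# For example (hypothetical header text):
#   extract_term("Title:", "Title: Alice's Adventures in Wonderland\nAuthor: Lewis Carroll")
# returns "Alice's Adventures in Wonderland"; an unmatched indicator returns the default.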
def get_author_and_title(book_text, title_case=True):
title = extract_term("Title:", book_text, default=None)
author = extract_term("Author:", book_text, default=None)
    # Fall back to other common ways the author line is formatted.
for term_indicator in ["\n\nby ", "\n\nOF ", "\nOF\n"]:
if author is None:
author = extract_term(term_indicator, book_text[:15000], max_term_size=25)
if title_case and title and author:
title, author = title.title(), author.title()
return title, author
def locate_beginning_of_text(title, author, text):
    # Skip past the "START OF THIS PROJECT GUTENBERG ..." marker line.
    location = text.find("START OF THIS PROJECT GUTENBERG")
    if location >= 0:
        location += 20
    else:
        # Marker not found; fall back to the first mention of the title or author.
        if title:
            location = text.find(title)
        if author:
            location = text.find(author)
    return location
def locate_end_of_text(text):
f = text.find
search_terms = ["End of Project Gutenberg",
"END OF THIS PROJECT GUTENBERG EBOOK",
"END OF THE PROJECT GUTENBERG EBOOK",
"End of the Project Gutenberg Etext"]
    location = max(f(term) for term in search_terms)
    if location < 0:
        print("Could not find an end-of-text marker; keeping the full text.")
        location = None
return location
def parse_book(book_text, min_paragraph_characters=100):
"""
Given the text of a book, returns a list of dictionaries with the keys:
    {title, author, contents, part}
"""
parsed_book_paragraphs = []
title, author = get_author_and_title(book_text)
text_starts = locate_beginning_of_text(title, author, book_text)
text_ends = locate_end_of_text(book_text)
book_paragraphs = book_text[text_starts:text_ends].split("\n\n")
for paragraph_number, raw_paragraph in enumerate(book_paragraphs):
paragraph = raw_paragraph.replace("\n", " ").strip()
if (len(paragraph) < min_paragraph_characters) or not re.search(letters, paragraph):
continue
if "gutenberg" in paragraph.lower() or "chapter" in paragraph.lower():
continue
book_data = {"title": title,
"author": author,
"contents": paragraph,
"part": paragraph_number}
parsed_book_paragraphs.append(book_data)
return parsed_book_paragraphs
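# Each returned entry is a plain dict, roughly of this shape (hypothetical values):
#   {"title": "Alice'S Adventures In Wonderland",
#    "author": "Lewis Carroll",
#    "contents": "Alice was beginning to get very tired of sitting by her sister...",
#    "part": 12}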
def get_list_of_book_paths(book_directory):
return list(glob.iglob(book_directory + '/*.txt'))
def books_to_pandas(book_directory, min_paragraph_characters=100):
paragraphs = []
for filename in get_list_of_book_paths(book_directory):
book_text = get_text(filename)
parsed_book = parse_book(book_text, min_paragraph_characters)
paragraphs.extend(parsed_book)
return pd.DataFrame(paragraphs)
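# The resulting DataFrame has one row per paragraph and one column each for the
# title, author, contents and part fields (previewed with books.head() below).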
class LemmaTokenizer(object):
def __init__(self):
self.wnl = WordNetLemmatizer()
def __call__(self, doc):
return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
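# Note: word_tokenize and WordNetLemmatizer rely on NLTK data packages; if they
# are missing, downloading them once should be enough, e.g.
#   import nltk; nltk.download('punkt'); nltk.download('wordnet')
# Without a POS tag, lemmatize() treats every token as a noun, so
# LemmaTokenizer()("The cats were running") gives roughly
# ['The', 'cat', 'were', 'running'].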
def cosine_similarity(new_docs, old_docs):
"""
Returns a similarity matrix where the first row is an array of
similarities of the first new_doc compared with each of the old
docs.
"""
    # Rows produced by TfidfVectorizer are L2-normalised, so this sparse dot
    # product is exactly the cosine similarity.
    return new_docs * old_docs.T
def find_closest_matches(similarity_matrix, n_matches_to_return=1):
"""
Expects a dense array of the form [[1., .5, .2],
[.3, 1., .1],
[.2, .4, 1.]]
where rows correspond to similarities.
"""
top_indices = np.apply_along_axis(func1d=lambda x: x.argsort()[-n_matches_to_return:][::-1],
axis=1,
arr=similarity_matrix)
return top_indices
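# Quick sanity check on the toy matrix from the docstring above: the most similar
# column of each row is on the diagonal, so the expected output is [[0], [1], [2]].
#   toy = np.array([[1., .5, .2], [.3, 1., .1], [.2, .4, 1.]])
#   find_closest_matches(toy, n_matches_to_return=1)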
simple_cache = {}
def search_book(paragraph, book_title, books, n_results=10, print_results=False, return_title=False):
book_title_list = book_title if isinstance(book_title, (list, tuple)) else (book_title,)
select_books = books[books.title.isin(book_title_list)].reset_index()
contents = select_books.contents
    # Cache the fitted vectorizer per selection; use a tuple so the key is hashable
    # even when a list of titles is passed in.
    cache_key = tuple(book_title_list)
    if cache_key not in simple_cache:
        vectorizer = TfidfVectorizer(max_df=.7, min_df=.0001, tokenizer=LemmaTokenizer()).fit(contents)
        simple_cache[cache_key] = {"vectorizer": vectorizer,
                                   "vect_book": vectorizer.transform(contents)}
    vectorizer = simple_cache[cache_key]["vectorizer"]
    vect_book = simple_cache[cache_key]["vect_book"]
vect_paragraph = vectorizer.transform([paragraph])
    # Brute-force nearest neighbours; with L2-normalised tf-idf vectors the
    # Euclidean ranking is the same as the cosine ranking.
    nbrs = NearestNeighbors(n_neighbors=n_results, algorithm='brute').fit(vect_book)
    distances, indices = nbrs.kneighbors(vect_paragraph)
    search_results = list(zip(distances[0],
                              select_books.iloc[indices[0]].contents,
                              select_books.iloc[indices[0]].title))
if print_results:
for dist, text, title in search_results:
print(dist)
print(text)
print("\n")
if return_title:
return search_results
return [(dist, text) for dist, text, title in search_results]
def compare_book_paragraphs(book_title, books, n_close_matches=10, same_book_in_corpus=True):
results = []
book_title_list = book_title if isinstance(book_title, (list, tuple)) else (book_title,)
select_books = books[books.title.isin(book_title_list)].reset_index()
for paragraph in select_books.contents:
        if same_book_in_corpus:
            # The closest match is the paragraph itself, so keep the second result.
            result = search_book(paragraph, book_title, books, n_results=2)[1]
else:
result = search_book(paragraph, book_title, books, n_results=1)[0]
results.append([paragraph] + list(result))
df = pd.DataFrame(results, columns=["Text 1", "Distance", "Text 2"])
df.sort_values("Distance", inplace=True)
print("Perfect matches")
perfect_matches = df[df.Distance == 0].drop_duplicates()
df_to_html(perfect_matches)
print("\n")
print ("Close matches")
top_close_matches = df[df.Distance != 0].drop_duplicates("Distance").head(n_close_matches)
df_to_html(top_close_matches)
def compare_book_to_books(book_title, other_book_titles, books, n_close_matches=20):
results = []
if book_title in other_book_titles:
other_book_titles = tuple([title for title in other_book_titles if title != book_title])
select_books = books[books.title.isin(other_book_titles)].reset_index()
book = books[books.title == book_title].reset_index()
for paragraph in book.contents:
result = search_book(paragraph, other_book_titles, books, n_results=1, return_title=True)[0]
results.append([paragraph] + list(result))
df = pd.DataFrame(results, columns=["Text 1", "Distance", "Text 2", "Title"])
df.sort_values("Distance", inplace=True)
print("Perfect matches")
perfect_matches = df[df.Distance == 0].drop_duplicates()
df_to_html(perfect_matches)
print("\n")
print ("Close matches")
close_matches = df[df.Distance != 0]
df_to_html(close_matches.drop_duplicates("Distance").head(n_close_matches))
return close_matches
In [2]:
books = books_to_pandas("popular_books", min_paragraph_characters=1)
In [3]:
books.head()
Out[3]:
In [4]:
books.title.value_counts()
Out[4]:
Number of comparisons ~ 5000
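The rough counts quoted in this and the later headings can be reproduced from the books DataFrame, assuming a "comparison" means one (query paragraph, candidate paragraph) pair; the exact figures depend on which files sit in popular_books. A minimal sketch:
alice = "Alice'S Adventures In Wonderland"
n_alice = (books.title == alice).sum()   # paragraphs in one book
n_total = len(books)                     # paragraphs in the whole corpus
print("one query vs. one book: ", n_alice)              # ~ 5 thousand
print("one book vs. itself:    ", n_alice * n_alice)    # ~ 25 million
print("one book vs. all books: ", n_alice * n_total)    # ~ 500 million
print("all books vs. all books:", n_total * n_total)    # > 13 billion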
In [5]:
%%time
paragraph = "alice doesn't know which way to go"
book_title = "Alice'S Adventures In Wonderland"
search_book(paragraph, book_title, books, n_results=5, print_results=True)
In [6]:
%%time
paragraph = "queen says off with his or her head"
book_title = "Alice'S Adventures In Wonderland"
search_book(paragraph, book_title, books, n_results=5, print_results=True)
Number of comparisons ~ 25 million
In [7]:
# See the entire string when printing a data frame
# (older pandas versions used -1 instead of None here).
pd.set_option('display.max_colwidth', None)
In [8]:
%%time
compare_book_paragraphs(book_title, books)
Number of comparisons ~ 500 million
In [9]:
all_book_titles = books.title.unique().tolist()
In [10]:
%%time
book_title = "Alice'S Adventures In Wonderland"
close_matches = compare_book_to_books(book_title, all_book_titles, books)
In [11]:
close_matches[:100].Title.value_counts()
Out[11]:
In [12]:
%%time
book_title = "The Adventures Of Sherlock Holmes"
close_matches = compare_book_to_books(book_title, all_book_titles, books)
In [13]:
close_matches[:200].Title.value_counts()
Out[13]:
Number of comparisons > 13 billion
In [18]:
%%time
def compare_all_books(books, n_close_matches=20):
    """
    For each paragraph of each book, find the single closest paragraph from any
    other book and return every pair, sorted by distance (n_close_matches is unused).
    """
vectorizer = TfidfVectorizer(max_df=.7, min_df=.0001, tokenizer=LemmaTokenizer()).fit(books.contents)
vect_book = vectorizer.transform(books.contents)
results = {"book_1_title":[],
"book_1_paragraph":[],
"book_2_title":[],
"book_2_paragraph":[],
"paragraph_distance":[]}
book_titles = books.title.dropna().unique().tolist()
for book_title in book_titles:
book_mask = (books.title == book_title).values
other_book_mask = ~book_mask
nbrs = NearestNeighbors(n_neighbors=1, algorithm='brute').fit(vect_book[other_book_mask])
distances, indices = nbrs.kneighbors(vect_book[book_mask])
book_content = books.loc[book_mask, "contents"].tolist()
results["book_1_paragraph"].extend(book_content)
matches = books[other_book_mask].contents.values[indices.flatten()]
results["book_2_paragraph"].extend(matches)
book_1_title = [book_title] * sum(book_mask)
results["book_1_title"].extend(book_1_title)
book_2_title = books[other_book_mask].title.values[indices.flatten()]
results["book_2_title"].extend(book_2_title)
results["paragraph_distance"].extend(distances.flatten())
results_sorted_by_distance = pd.DataFrame(results).sort_values("paragraph_distance")
return results_sorted_by_distance
results = compare_all_books(books)
In [55]:
results[results.paragraph_distance == 0].drop_duplicates()[:100]
Out[55]:
In [54]:
results[results.paragraph_distance > 0].drop_duplicates("paragraph_distance")[:100]
Out[54]: